# Source Generated with Decompyle++
# File: in.pyo (Python 2.5)

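"""A cleanup tool for HTML.

Removes unwanted tags and content.  See the ``Cleaner`` class for details.
"""
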
import re
import copy

try:
    from urlparse import urlsplit
except ImportError:
    # Python 3
    from urllib.parse import urlsplit

from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import _nons, _transform_result

try:
    set
except NameError:
    from sets import Set as set

# Python 2/3 compatibility aliases: check the bare builtin name and fall
# back to the closest available type.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr

try:
    unicode
except NameError:
    # Python 3
    unicode = str

try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str

try:
    basestring
except NameError:
    basestring = (str, bytes)

__all__ = [
    'clean_html',
    'clean',
    'Cleaner',
    'autolink',
    'autolink_html',
    'word_break',
    'word_break_html',
]

# Patterns for scriptable constructs that must not survive in CSS or URLs.
_css_javascript_re = re.compile(r'expression\s*\(.*?\)', re.S | re.I)
_css_import_re = re.compile(r'@\s*import', re.I)
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
_substitute_whitespace = re.compile(r'\s+').sub
# IE conditional comments, e.g. <!--[if IE]> ... <![endif]-->
_conditional_comment_re = re.compile(r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I | re.S)

_find_styled_elements = etree.XPath('descendant-or-self::*[@style]')
_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and "
    "substring(normalize-space(@href),1,1) != '#'] | "
    "descendant-or-self::x:a[normalize-space(@href) and "
    "substring(normalize-space(@href),1,1) != '#']",
    namespaces={'x': XHTML_NAMESPACE})


class Cleaner(object):
    """
    Instances of this class clean the documents they are given by removing
    or rewriting whatever the options below disallow.  Each keyword argument
    to the constructor overrides the class attribute of the same name.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    'Unknown parameter: %s=%r' % (name, value))
            setattr(self, name, value)

    # For each tag that can embed or link external content, the attribute(s)
    # that hold the URL.  Used by allow_element() / allow_embedded_url().
    _tag_link_attrs = dict(
        script='src',
        link='href',
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        a='href',
    )

    def __call__(self, doc):
        """
        Clean the document (modifies it in place).
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance rather than an element
            doc = doc.getroot()
        # Convert XHTML to HTML by stripping the namespace from tag names.
        for el in doc.iter():
            tag = el.tag
            if isinstance(tag, basestring):
                el.tag = _nons(tag)
        # IE treats <image> like <img>; normalize it so later steps see <img>.
        for el in doc.iter('image'):
            el.tag = 'img'

        if not self.comments:
            # If comments are not being removed anyway, at least kill IE
            # conditional comments, which can smuggle arbitrary markup past
            # the parser.
            self.kill_conditional_comments(doc)

        kill_tags = set()
        remove_tags = set(self.remove_tags or ())
        if self.allow_tags:
            allow_tags = set(self.allow_tags)
        else:
            allow_tags = set()

        if self.scripts:
            kill_tags.add('script')

        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]

        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs_only already stripped the on* event handlers
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            # Remove javascript: (and similar) URLs everywhere.
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # Style is being kept, so scrub scriptable CSS from style
                # attributes and <style> elements.
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # Imported CSS can do anything; do not allow it.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new

        if self.comments or self.processing_instructions:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)

        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']

        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # If Javascript is not allowed, external stylesheets have to go
            # too, since scripts can hide in them.  Note that this also drops
            # alternate stylesheets.
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    el.drop_tree()

        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))

        if self.embedded:
            # Drop any <param> that is not inside an <applet> or <object>.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate content inside <iframe>/<object> is a useful
            # fallback, so those tags are removed rather than killed.
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))

        if self.frames:
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # The document root cannot be removed; turn it into a bare <div>.
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # Neither can it be killed; empty it instead.
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    'It does not make sense to pass in both '
                    'allow_tags and remove_unknown_tags')
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            for el in bad:
                el.drop_tag()

        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        # Elements in kill_tags/remove_tags survive only if every URL
        # attribute they carry passes allow_embedded_url().
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
            return False
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False
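
    # With the defaults above, embedded content is only kept when
    # allow_embedded_url() accepts it: the tag must be in ``whitelist_tags``
    # and the URL must be http(s) on a host listed in ``host_whitelist``.
    # For example (hypothetical host), Cleaner(host_whitelist=['www.youtube.com'])
    # keeps <iframe> embeds served from that host while other embedded
    # content is still removed.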

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments can embed HTML that the parser does not
        normally see, so kill any comment that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        # Links like "j a v a script:" may still be interpreted by browsers,
        # so collapse whitespace before testing the scheme.
        new = _substitute_whitespace('', link)
        if _javascript_scheme_re.search(new):
            # A scriptable URL scheme; drop the link entirely.
            return ''
        return link

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        # CSS comments, backslashes and whitespace can be used to disguise
        # "javascript:" or "expression(" inside a style value, so strip them
        # before checking.
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)


clean = Cleaner()
clean_html = clean.clean_html
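
# Usage sketch (illustrative; the markup below is made up).  The module-level
# ``clean_html`` uses the default options of the ``clean`` instance above;
# instantiate ``Cleaner`` directly for per-option control.
#
#     dirty = '<p onclick="evil()">hi</p><script>evil()</script>'
#     print(clean_html(dirty))
#     cleaner = Cleaner(style=True, links=False, add_nofollow=True)
#     print(cleaner.clean_html(dirty))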

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)'
               r'(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.]+[a-z]))', re.I),
]

# Elements whose text content should never be autolinked.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Hosts that should not be turned into links.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
]

_avoid_classes = ['nolink']


def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn URLs found in the text of this element (and its descendants) into
    <a> links, skipping avoid_elements, elements with a class listed in
    avoid_classes, and hosts matching avoid_hosts.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_name = class_name.split()
        for match_class in avoid_classes:
            if match_class in class_name:
                return
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if child.tail:
            text, tail_children = _link_text(
                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
            if tail_children:
                # The tail text produced new <a> elements; insert them right
                # after this child.
                child.tail = text
                index = el.index(child)
                el[index + 1:index + 1] = tail_children
    if el.text:
        text, pre_children = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children


def _link_text(text, link_regexes, avoid_hosts, factory):
    leading_text = ''
    links = []
    last_pos = 0
    while True:
        best_match, best_pos = None, None
        for regex in link_regexes:
            regex_pos = last_pos
            while True:
                match = regex.search(text, pos=regex_pos)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        # Avoided host; skip this match and search on.
                        regex_pos = match.end()
                        break
                else:
                    # No avoided host matched; accept this match.
                    break
            if match is None:
                continue
            if best_pos is None or match.start() < best_pos:
                best_match = match
                best_pos = match.start()
        if best_match is None:
            # No more matches; the remaining text trails the last link.
            if links:
                links[-1].tail = text
            else:
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith('.') or link.endswith(','):
            # Trailing punctuation should not become part of the link.
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            links[-1].tail = prev_text
        else:
            leading_text = prev_text
        anchor = factory('a')
        # The matched URL becomes the link target.
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith('.') or body.endswith(','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        text = text[end:]
    return leading_text, links


def autolink_html(html, *args, **kw):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)

autolink_html.__doc__ = autolink.__doc__
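
# Usage sketch (illustrative; the markup below is made up):
#
#     html = '<p>Docs at http://lxml.de/lxmlhtml.html - write to mailto:someone@lxml.de</p>'
#     print(autolink_html(html))
#
# Text inside _avoid_elements and URLs whose host matches _avoid_hosts
# (localhost, example.com/org/net, 127.0.0.1) are left untouched.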

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']


def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Break long words in the text of this element (and its descendants) by
    inserting ``break_character`` (a zero-width space, U+200B, by default)
    so that browsers can wrap them.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        dont_break = False
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                dont_break = True
                break
        if dont_break:
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)


def word_break_html(html, *args, **kw):
    result_type = type(html)
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
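
# Usage sketch (illustrative input): words longer than ``max_width`` get the
# break character (a zero-width space by default) inserted so that browsers
# can wrap them:
#
#     print(word_break_html('<p>supercalifragilisticexpialidocious</p>', max_width=10))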


def _break_text(text, max_width, break_character):
    words = text.split()
    for word in words:
        if len(word) > max_width:
            replacement = _insert_break(word, max_width, break_character)
            text = text.replace(word, replacement)
    return text

_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    result = ''
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Prefer to break just after a non-letter character, but only if
            # one occurs within the last 10 characters of the window.
            if last_break.end() > width - 10:
                start = word[:last_break.end()]
        result += start + break_character
        word = word[len(start):]
    result += word
    return result